EmptyDroplets (FDR <= 0.1) + scDblFindersetwd("/media/jacopo/Elements/re_align/MM/PRJNA723584/SAMN18822746/SRR14295370/")
# Load the libraries (from Sarah script + biomart)
library(tidyverse) # packages for data wrangling, visualization etc
library(Seurat) # scRNA-Seq analysis package
library(clustree) # plot of clustering tree
library(ggsignif) # Enrich your 'ggplots' with group-wise comparisons
library(clusterProfiler) #The package implements methods to analyze and visualize functional profiles of gene and gene clusters.
library(org.Hs.eg.db) # Human annotation package neede for clusterProfiler
library(ggrepel) # extra geoms for ggplo2
library(patchwork) #multiplots
library(reticulate)
Load and do the QC for the cellranger data
#list.files(".")
dat <- Read10X(data.dir ="./out/counts_filtered/")
dat <- CreateSeuratObject(dat) # Create the seurat object from the 10x data
kb.initial <- dat@assays[["RNA"]]@counts@Dim[[2]]
cat("Initial number of cells:", kb.initial,
"\nNumber of genes:", dat@assays[["RNA"]]@counts@Dim[[1]])
## Initial number of cells: 25921
## Number of genes: 36601
Empty cells were already filtered, check for % mt RNA and death markers:
# first calculate the mitochondrial percentage for each cell
dat$percent_mt <- PercentageFeatureSet(dat, pattern="^MT.")
# make violin plots
mt_rna = 5
max_counts = 20000
# Check some feature-feature relationships
# % mt RNA vs n Counts, n Features vs n Counts
# Check some feature-feature relationships
# % mt RNA vs n Counts, n Features vs n Counts
VlnPlot(dat, features = c("nCount_RNA", "nFeature_RNA", "percent_mt")) + geom_hline(yintercept=mt_rna, linetype = "dotted")
plot1 <- FeatureScatter(dat, feature1 = "nCount_RNA", feature2 = "percent_mt")
plot1 <- plot1 + geom_hline(yintercept=mt_rna, linetype = "dotted")
plot2 <- FeatureScatter(dat, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot2 <- plot2 + geom_vline(xintercept = max_counts, linetype = "dotted")
plot1
plot2
## cells retained by mt RNA content ( 5 %): 23761
## percentage of retained cells: 91.67 %
## cells retained by counts ( 20000 ): 23630
## percentage of retained cells: 91.16 %
Check the distribution of the cells with low counts and control death markers:
min_counts = 300
hist(dat@meta.data$nCount_RNA, breaks = 100, xlab = "Counts")
hist(dat@meta.data$nCount_RNA, breaks = 1000, xlab = "Counts", xlim = c(0,5000))
hist(dat@meta.data$nCount_RNA, breaks = 10000, xlab = "Counts", xlim = c(0,1000))
abline(v=min_counts, col="red", lty = 3)
The evident peak of cells with < 200 counts could contain dying
cells.
# Subset the dataset to focus only on those cells with low counts
dat.lowcount <- subset(dat, subset = nCount_RNA < min_counts)
# Get the mean of the counts for each gene and sort them decreasing
meanCounts <- rowMeans(GetAssayData(object = dat.lowcount, slot = 'counts'))
meanCounts <- sort(meanCounts, decreasing = T)
# A boxplot can help to observe the distribution of the means
#boxplot(meanCounts)
# Print the most highly expressed genes
head(meanCounts, 30)
## IGKC RPLP1 RPS18 RPS14 MALAT1 IGHG1 RPS19
## 50.5149254 3.2537313 2.6865672 2.2164179 2.0522388 2.0373134 1.8358209
## RPL18A EEF1A1 RPL13 RPL18 RPL7A RPL32 RPS12
## 1.7537313 1.5000000 1.4850746 1.4850746 1.4402985 1.4104478 1.4029851
## RPL29 RPL10 IGHG3 RPS23 RPS28 RPL3 RPS15
## 1.3880597 1.3880597 1.3731343 1.3582090 1.3507463 1.3507463 1.3283582
## RPS27 RPS3 RPL37 RPL28 RPL41 B2M RPS27A
## 1.3208955 1.2089552 1.1641791 1.1044776 1.0671642 1.0522388 1.0373134
## TPT1 RPS5
## 1.0298507 0.9477612
## cells retained by counts ( 300 ): 23495
## percentage of retained cells: 90.64 %
dir.create("result")
saveRDS(dat, file = "./result/SAMN18822746_clean_QC.Rds")
#Normalize
dat <- NormalizeData(dat)
# Find the first 4000 variabe features
dat <- FindVariableFeatures(dat, selection.method = "vst", nfeatures = 4000)
Set mean expression to 0 and variance across 1 to avoid highly expressed genes drive the forwarding analyses. Since negative expression is meaningless, scaled data are useful only for UMAP and clustering
# scale data, the scaled data are saved in:
# dat[["RNA"]]@scale.data
all.genes <- rownames(dat)
dat <- ScaleData(dat, vars.to.regress = c("percent_mt","nCount_RNA"))
dat <- RunPCA(dat, features = VariableFeatures(object = dat), verbose = F, seed.use = 1)
print(dat[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: HIST1H4C, RRM2, NUSAP1, TYMS, STMN1
## Negative: RPS14, RPS18, RPS23, RPL18, RPL32
## PC_ 2
## Positive: HLA-B, MALAT1, KLF2, JUN, CD27
## Negative: RPS18, RPL29, RPL18A, RPL7A, RPS5
## PC_ 3
## Positive: MALAT1, RRBP1, RPL34, NEAT1, NCL
## Negative: HIST1H4C, RRM2, STMN1, NUSAP1, TYMS
## PC_ 4
## Positive: KLF2, RPS14, JUN, CXCR4, TNFRSF13B
## Negative: JCHAIN, PTP4A3, MAPKAPK2, RRBP1, IFI30
## PC_ 5
## Positive: B2M, HLA-B, RPS8, ISG15, PPIB
## Negative: MALAT1, RPL32, RPS18, RPS23, KCNQ1OT1
UMAP is a graph-based method of clustering. The first step in this process is to construct a KNN graph based on the euclidean distance in PCA space:
dat <- FindNeighbors(dat, dims = 1:20)
The graph now can be used as input for the function
runUMAP()
dat <- RunUMAP(dat, dims = 1:20, seed.use = 1)
DimPlot(dat, reduction = 'umap', seed = 1)
## QC metrics
## markers